Step 1: Data cleaning

# --- Setup: knitr options, packages, raw data import -------------------------
knitr::opts_chunk$set(echo = TRUE)
# cairo PNG device: better unicode glyph rendering (e.g. skimr's inline histograms).
knitr::opts_chunk$set(dev.args = list(png = list(type = "cairo")))
library(uwot)          # UMAP dimensionality reduction (umap, umap_transform)
## Warning: package 'uwot' was built under R version 3.6.3
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 3.6.2
library(skimr)         # compact data summaries (skim)
library(randomForest)  # random forests + na.roughfix / rfImpute
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(magrittr)      # %>% pipe
library(caret)         # model training / resampling framework
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.3
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
library(gbm)           # gradient boosting (backend for caret method = "gbm")
## Loaded gbm 2.1.5
# Read the raw measurements; the first CSV column supplies the row names.
data <- read.csv("~/My R environment/CALCTALUS/data_CALCTALUS.csv", encoding="UTF-8", row.names=1)
# Summary of the raw dataset (180 rows x 33 columns; see output below).
skim(data)
Data summary
Name data
Number of rows 180
Number of columns 33
_______________________
Column type frequency:
factor 1
numeric 32
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
SEX 0 1 FALSE 2 FEM: 93, MAL: 87

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
AGE 0 1.00 3.28 1.81 1.0 2.00 3.0 5.00 7.0 ▇▃▃▂▃
Sexo 0 1.00 1.48 0.50 1.0 1.00 1.0 2.00 2.0 ▇▁▁▁▇
TM1E 3 0.98 50.59 3.69 39.5 48.00 50.0 53.00 61.0 ▁▆▇▅▁
TM1D 2 0.99 50.87 3.67 39.0 48.00 50.5 53.00 60.5 ▁▃▇▆▂
TM2E 3 0.98 39.23 3.01 31.0 37.00 39.0 41.50 46.5 ▁▆▇▇▂
TM2D 2 0.99 39.38 2.89 32.0 37.00 39.0 41.38 46.0 ▁▆▇▇▂
TM3E 3 0.98 29.10 2.30 21.5 27.00 29.0 30.50 35.0 ▁▂▇▅▁
TM3D 2 0.99 29.18 2.27 21.0 27.12 29.0 31.00 36.0 ▁▅▇▅▁
TM3aE 3 0.98 30.54 2.36 23.0 29.00 30.0 32.00 37.5 ▁▃▇▅▁
TM3aD 2 0.99 30.55 2.34 22.0 29.00 30.5 32.00 38.0 ▁▃▇▅▁
TM4E 3 0.98 31.15 2.43 24.0 30.00 31.0 33.00 39.0 ▁▆▇▂▁
TM4D 2 0.99 31.38 2.49 24.0 30.00 31.0 33.00 38.5 ▁▅▇▅▁
TM5E 3 0.98 26.45 2.36 21.5 25.00 26.0 28.00 33.0 ▂▇▆▃▁
TM5D 2 0.99 26.20 2.34 21.0 24.42 26.0 28.00 32.5 ▂▇▆▆▁
CM1E 5 0.97 75.33 5.39 57.0 72.00 75.0 79.00 91.0 ▁▃▇▅▁
CM1D 5 0.97 75.63 5.61 57.0 72.00 76.0 79.00 92.5 ▁▃▇▃▁
CM1aE 5 0.97 70.73 5.26 53.0 67.25 70.0 73.75 86.0 ▁▃▇▅▁
CM1aD 5 0.97 70.41 5.26 53.0 67.00 70.0 74.00 86.0 ▁▃▇▅▁
CM2E 5 0.97 39.77 3.16 29.0 37.50 39.5 42.00 48.0 ▁▂▇▇▂
CM2D 4 0.98 39.76 3.04 29.5 38.00 39.5 42.00 48.0 ▁▂▇▆▁
CM4E 7 0.96 40.83 3.87 33.0 38.00 40.0 44.00 54.0 ▅▇▆▂▁
CM4D 5 0.97 40.65 3.87 32.0 38.00 40.0 43.00 53.0 ▃▇▆▃▁
CMSE 6 0.97 46.95 4.18 37.0 44.00 46.5 50.00 58.0 ▁▇▇▅▂
CMSD 4 0.98 46.61 3.95 37.0 44.00 46.0 49.62 56.0 ▁▇▇▅▂
CM5E 5 0.97 53.86 4.18 38.0 51.50 54.0 56.00 65.0 ▁▂▇▅▁
CM5D 5 0.97 53.95 4.17 38.0 51.25 54.0 56.50 64.0 ▁▂▇▇▂
CM7E 14 0.92 43.09 4.28 33.0 40.00 42.5 47.00 58.0 ▂▇▆▂▁
CM7D 15 0.92 43.83 4.33 33.0 41.00 44.0 47.00 55.5 ▂▇▇▆▁
CM8E 9 0.95 29.48 2.63 21.0 28.00 29.0 31.25 36.0 ▁▃▇▅▂
CM8D 12 0.93 29.48 2.67 21.0 28.00 29.0 31.12 36.0 ▁▃▇▅▁
CFCPCE 5 0.97 44.58 3.47 34.0 42.00 44.5 47.00 54.0 ▁▅▇▅▁
CFAPCD 4 0.98 45.05 3.62 34.0 43.00 45.0 47.00 56.0 ▁▃▇▃▁
# --- Imputation of missing values and left/right merging ---------------------
# Drop the first three columns (demographics) so only the paired left/right
# bone measurements remain for imputation.
xbuild <- data[,-c(1:3)]

# Unsupervised random forest on roughly-imputed data yields a proximity
# matrix; project (1 - proximity) to 2D with classical MDS and k-means it.
# The resulting cluster labels act as the response for rfImpute() below.
set.seed(29)
rf <- randomForest(xbuild %>% na.roughfix, proximity = TRUE)
xCoordinates <- (1 - rf$proximity) %>% cmdscale()
cl <- kmeans(xCoordinates, 6)
plot(xCoordinates, col = cl$cluster)

# Proximity-weighted imputation of the missing measurements; [,-1] drops the
# helper cluster-label response column added by rfImpute.
new_x <- rfImpute(xbuild, cl$cluster %>% factor)[,-1]
## ntree      OOB      1      2      3      4      5      6
##   300:  17.78%  0.00% 11.43% 80.00% 39.13% 10.71% 13.79%
## ntree      OOB      1      2      3      4      5      6
##   300:  17.22%  0.00% 11.43% 73.33% 34.78% 17.86% 10.34%
## ntree      OOB      1      2      3      4      5      6
##   300:  17.78%  0.00% 14.29%100.00% 26.09% 10.71% 10.34%
## ntree      OOB      1      2      3      4      5      6
##   300:  18.89%  0.00% 14.29% 80.00% 34.78% 17.86% 13.79%
## ntree      OOB      1      2      3      4      5      6
##   300:  17.22%  0.00%  5.71% 80.00% 30.43% 17.86% 17.24%
# Now we do not have missing values anymore, plus mean and median remain unchanged; this is a robust approach.

# Average each adjacent left/right column pair (columns i-1, i for even i)
# into a single measurement per bone dimension.
x_mod <- sapply(seq(2, ncol(new_x), 2), function(i) {
  rowMeans(new_x[,c(i, i - 1)], na.rm = TRUE)
})

measurements <- data.frame(x_mod)

# Derive merged variable names by stripping the trailing side-suffix letter
# from every second column name (e.g. "TM1D" -> "TM1").
selected.vars <- sapply(seq(2, ncol(new_x), 2), function(i) {
  substr(colnames(new_x)[i], 1, nchar(colnames(new_x)[i]) - 1)
})

colnames(measurements) <- selected.vars

# Re-attach the two retained demographic columns to the merged measurements.
calctalus <- cbind(data[,1:2], measurements)
# write.csv2(calctalus, "calctalus.csv") # we could save the dataset at this point

data <- calctalus[,-1] # remove AGE, not relevant for this problem
machinedata <- data[,-1] # remove SEX, for data managing without the Y/output
# NOTE(review): the [,-1] below additionally drops the FIRST measurement
# column before scaling, and `datascaled` is never used afterwards — confirm
# whether this line is intentional or leftover.
datascaled <- scale(machinedata[,-1])
# Summary of the cleaned dataset (180 rows x 16 columns; see output below).
skim(data)
Data summary
Name data
Number of rows 180
Number of columns 16
_______________________
Column type frequency:
factor 1
numeric 15
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
SEX 0 1 FALSE 2 FEM: 93, MAL: 87

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
TM1 0 1 50.71 3.66 39.25 48.00 50.38 53.00 60.75 ▁▃▇▅▂
TM2 0 1 39.29 2.92 31.50 37.25 39.00 41.50 46.25 ▁▅▇▇▂
TM3 0 1 29.13 2.27 21.25 27.19 29.00 30.56 35.50 ▁▂▇▅▁
TM3a 0 1 30.53 2.34 22.50 29.00 30.25 32.25 37.75 ▁▃▇▅▁
TM4 0 1 31.25 2.40 24.00 29.75 31.00 33.00 38.75 ▁▅▇▅▁
TM5 0 1 26.32 2.31 21.25 24.50 26.00 28.00 32.50 ▂▇▇▃▁
CM1 0 1 75.50 5.43 57.00 72.00 75.29 78.52 90.50 ▁▂▇▅▁
CM1a 0 1 70.59 5.19 53.00 67.19 70.26 73.56 86.00 ▁▂▇▅▁
CM2 0 1 39.76 3.05 29.25 37.50 39.50 42.00 48.00 ▁▂▇▆▁
CM4 0 1 40.76 3.81 32.50 38.00 40.20 43.56 53.50 ▃▇▆▂▁
CMS 0 1 46.79 3.99 37.00 43.94 46.28 50.00 56.75 ▁▇▇▅▂
CM5 0 1 53.91 4.09 38.00 51.50 53.57 56.31 64.00 ▁▂▇▇▂
CM7 0 1 43.62 4.19 33.00 40.50 43.13 47.00 56.75 ▂▇▇▆▁
CM8 0 1 29.50 2.57 21.00 27.75 29.25 31.50 35.75 ▁▂▇▅▂
CFAPC 0 1 44.82 3.46 34.00 42.44 44.75 46.75 54.00 ▁▃▇▅▁

Step 2: Visualization of dataset properties

# --- Gaussian graphical model, PCA biplot, supervised UMAP -------------------
library(correlation)
library(ggraph)

# Partial-correlation network across all merged measurements.
gg <- machinedata %>%
  correlation(partial = TRUE) %>%
  plot()

gg + ggtitle("Gaussian Graphical Model (GGM) of CalcTalus dataset") +
  scale_edge_color_viridis(option = "plasma", name = "r (partial correlation)")
## Scale for 'edge_colour' is already present. Adding another scale for
## 'edge_colour', which will replace the existing scale.

library(ggfortify)

# PCA biplot coloured by sex, with variable loadings overlaid.
autoplot(prcomp(machinedata, scale. = TRUE), data = calctalus, colour = 'SEX',
         loadings = TRUE, loadings.label = TRUE, loadings.colour = 'black') + theme_minimal()
## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

# Supervised 3-component UMAP of the full dataset (SEX supplied as target y).
ct.umap <- as.data.frame(umap(machinedata, n_components = 3, metric = "cosine",
                              init = "pca", spread = 24, n_epochs = 1024, min_dist = 0.5, y = data$SEX))
ct.umap$SEX <- data$SEX
library(plotly)
# Interactive 3D scatter of the embedding, coloured by sex.
plot_ly(data = ct.umap, x = ~V1, y = ~V2, z = ~V3, color = ~SEX, type = 'scatter3d', mode = "markers")
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

Step 3: Statistical modelling

# --- Step 3 setup: stratified split and LOOCV resampling ---------------------
library(caret)
library(gbm)
# Stratified 75/25 train/test split on the outcome SEX.
set.seed(29)
inTraining <- createDataPartition(data$SEX, p = .75, list = FALSE)
training <- data[ inTraining,]
testing  <- data[-inTraining,]

# Leave-one-out cross-validation, shared by all models below.
fitControl <- trainControl(method = "LOOCV")

# Gradient boosting on the raw measurements (caret's default tuning grid).
set.seed(29)
gbmFit1 <- train(SEX ~ ., data = training, 
                 method = "gbm", 
                 trControl = fitControl,
                 verbose = FALSE)
gbmFit1
## Stochastic Gradient Boosting 
## 
## 136 samples
##  15 predictor
##   2 classes: 'FEMALE', 'MALE' 
## 
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ... 
## Resampling results across tuning parameters:
## 
##   n.trees  interaction.depth  Accuracy   Kappa    
##    50      1                  0.8529412  0.7053726
##    50      2                  0.8676471  0.7348354
##    50      3                  0.8750000  0.7494582
##   100      1                  0.8602941  0.7202252
##   100      2                  0.8676471  0.7346054
##   100      3                  0.8750000  0.7494582
##   150      1                  0.8750000  0.7496752
##   150      2                  0.8676471  0.7346054
##   150      3                  0.8823529  0.7640937
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth =
##  3, shrinkage = 0.1 and n.minobsinnode = 10.
varImp(gbmFit1)  # relative variable importance, scaled 0-100 (see output)
## gbm variable importance
## 
##       Overall
## TM4   100.000
## TM1    99.823
## CMS    69.306
## CM1a   60.232
## TM3    38.943
## CM2    37.448
## CM5    24.136
## CFAPC  11.135
## TM5     8.051
## CM8     7.137
## CM7     6.993
## CM4     4.732
## CM1     2.301
## TM2     1.602
## TM3a    0.000
# Hold-out evaluation on the 25% test split.
confusionMatrix(predict(gbmFit1, testing), testing$SEX)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FEMALE MALE
##     FEMALE     20    2
##     MALE        3   19
##                                           
##                Accuracy : 0.8864          
##                  95% CI : (0.7544, 0.9621)
##     No Information Rate : 0.5227          
##     P-Value [Acc > NIR] : 3.192e-07       
##                                           
##                   Kappa : 0.7727          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8696          
##             Specificity : 0.9048          
##          Pos Pred Value : 0.9091          
##          Neg Pred Value : 0.8636          
##              Prevalence : 0.5227          
##          Detection Rate : 0.4545          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.8872          
##                                           
##        'Positive' Class : FEMALE          
## 
# Random forest on the raw measurements, same LOOCV scheme.
# NOTE(review): no set.seed() immediately before this fit (unlike gbmFit1),
# so the exact numbers below are not reproducible in isolation — confirm.
rfFit <- train(SEX ~ ., data = training, 
               method = "rf", 
               trControl = fitControl,
               verbose = FALSE)
rfFit
## Random Forest 
## 
## 136 samples
##  15 predictor
##   2 classes: 'FEMALE', 'MALE' 
## 
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8602941  0.7197397
##    8    0.8455882  0.6905072
##   15    0.8235294  0.6461405
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
varImp(rfFit)  # relative variable importance, scaled 0-100 (see output)
## rf variable importance
## 
##       Overall
## TM1    100.00
## CMS     87.94
## TM4     74.97
## TM3a    61.14
## CM8     58.13
## CM1a    56.16
## TM3     43.85
## CM4     41.83
## TM5     40.35
## CFAPC   40.19
## CM1     32.44
## TM2     31.95
## CM2     28.33
## CM7     13.60
## CM5      0.00
# Hold-out evaluation on the 25% test split.
confusionMatrix(predict(rfFit, testing), testing$SEX)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FEMALE MALE
##     FEMALE     21    1
##     MALE        2   20
##                                           
##                Accuracy : 0.9318          
##                  95% CI : (0.8134, 0.9857)
##     No Information Rate : 0.5227          
##     P-Value [Acc > NIR] : 4.385e-09       
##                                           
##                   Kappa : 0.8636          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9130          
##             Specificity : 0.9524          
##          Pos Pred Value : 0.9545          
##          Neg Pred Value : 0.9091          
##              Prevalence : 0.5227          
##          Detection Rate : 0.4773          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.9327          
##                                           
##        'Positive' Class : FEMALE          
## 
# RBF-kernel SVM on the raw measurements, same LOOCV scheme.
# NOTE(review): no preProcess = c("center", "scale") here — sigma is estimated
# from the raw data (see output); confirm scaling is deliberately omitted.
svmFit <- train(SEX ~ ., data = training, 
                method = "svmRadial", 
                trControl = fitControl)
svmFit  
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 136 samples
##  15 predictor
##   2 classes: 'FEMALE', 'MALE' 
## 
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ... 
## Resampling results across tuning parameters:
## 
##   C     Accuracy   Kappa    
##   0.25  0.8455882  0.6899696
##   0.50  0.8676471  0.7343750
##   1.00  0.8529412  0.7051171
## 
## Tuning parameter 'sigma' was held constant at a value of 0.1391779
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.1391779 and C = 0.5.
# Hold-out evaluation on the 25% test split.
confusionMatrix(predict(svmFit, testing), testing$SEX)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FEMALE MALE
##     FEMALE     22    3
##     MALE        1   18
##                                           
##                Accuracy : 0.9091          
##                  95% CI : (0.7833, 0.9747)
##     No Information Rate : 0.5227          
##     P-Value [Acc > NIR] : 4.23e-08        
##                                           
##                   Kappa : 0.817           
##                                           
##  Mcnemar's Test P-Value : 0.6171          
##                                           
##             Sensitivity : 0.9565          
##             Specificity : 0.8571          
##          Pos Pred Value : 0.8800          
##          Neg Pred Value : 0.9474          
##              Prevalence : 0.5227          
##          Detection Rate : 0.5000          
##    Detection Prevalence : 0.5682          
##       Balanced Accuracy : 0.9068          
##                                           
##        'Positive' Class : FEMALE          
## 

Supervised UMAP modeling

library(vizier)
set.seed(29)

# Supervised 3-component UMAP fitted on the TRAINING split only (SEX as target
# y); ret_model = TRUE retains the model so test rows can be projected later.
# The non-numeric SEX column in `training` is ignored by uwot ("found 15
# numeric columns" in the log below).
train_umap <- umap(training, y = training$SEX, verbose = TRUE, n_components = 3, spread = 8, ret_model = TRUE)
## 13:50:53 UMAP embedding parameters a = 0.07163 b = 0.7918
## 13:50:53 Read 136 rows and found 15 numeric columns
## 13:50:53 Using Annoy for neighbor search, n_neighbors = 15
## 13:50:53 Building Annoy index with metric = euclidean, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 13:50:53 Writing NN index file to temp file C:\Users\delvi\AppData\Local\Temp\RtmpaACilG\file422c594f7e4c
## 13:50:53 Searching Annoy index using 4 threads, search_k = 1500
## 13:50:54 Annoy recall = 100%
## 13:50:54 Commencing smooth kNN distance calibration using 4 threads
## 13:50:54 Processing y data
## 13:50:54 Carrying out categorical intersection for 1 column
## 13:50:54 Applying categorical set intersection, weight = 0.5 far distance = 5
## 13:50:54 Initializing from normalized Laplacian + noise
## 13:50:54 Commencing optimization for 500 epochs, with 2394 positive edges
## 13:50:54 Optimization finished
# 3D view of the training embedding, coloured by sex.
vizier::embed_plotly(train_umap$embedding, as.factor(training$SEX))
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
set.seed(29)
# Project the held-out test rows into the trained embedding space (no label
# information from the test set is used here).
test_umap <- umap_transform(testing, train_umap, verbose = TRUE)
## 13:50:55 Read 44 rows and found 15 numeric columns
## 13:50:55 Processing block 1 of 1
## 13:50:55 Writing NN index file to temp file C:\Users\delvi\AppData\Local\Temp\RtmpaACilG\file422c77ae63fd
## 13:50:55 Searching Annoy index using 4 threads, search_k = 1500
## 13:50:55 Commencing smooth kNN distance calibration using 4 threads
## 13:50:55 Initializing by weighted average of neighbor coordinates using 4 threads
## 13:50:55 Commencing optimization for 167 epochs, with 660 positive edges
## 13:50:55 Finished
vizier::embed_plotly(test_umap, as.factor(testing$SEX))
# The embedding coordinates (V1..V3) plus the label become the new training
# set for the second round of classifiers below.
training2 <- as.data.frame(train_umap$embedding)
training2$SEX <- training$SEX

# Gradient boosting on the 3-D supervised-UMAP coordinates (LOOCV as before).
set.seed(29)
gbmFit2 <- train(SEX ~ ., data = training2, 
                 method = "gbm", 
                 trControl = fitControl,
                 verbose = FALSE)
gbmFit2
## Stochastic Gradient Boosting 
## 
## 136 samples
##   3 predictor
##   2 classes: 'FEMALE', 'MALE' 
## 
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ... 
## Resampling results across tuning parameters:
## 
##   n.trees  interaction.depth  Accuracy  Kappa
##    50      1                  1         1    
##    50      2                  1         1    
##    50      3                  1         1    
##   100      1                  1         1    
##   100      2                  1         1    
##   100      3                  1         1    
##   150      1                  1         1    
##   150      2                  1         1    
##   150      3                  1         1    
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 50, interaction.depth =
##  1, shrinkage = 0.1 and n.minobsinnode = 10.
varImp(gbmFit2)  # importance concentrates on V1 (see output)
## gbm variable importance
## 
##    Overall
## V1     100
## V3       0
## V2       0
# Evaluate on the projected test coordinates from umap_transform().
confusionMatrix(predict(gbmFit2, test_umap), testing$SEX)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FEMALE MALE
##     FEMALE     23    2
##     MALE        0   19
##                                           
##                Accuracy : 0.9545          
##                  95% CI : (0.8453, 0.9944)
##     No Information Rate : 0.5227          
##     P-Value [Acc > NIR] : 3.335e-10       
##                                           
##                   Kappa : 0.9085          
##                                           
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9048          
##          Pos Pred Value : 0.9200          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.5227          
##          Detection Rate : 0.5227          
##    Detection Prevalence : 0.5682          
##       Balanced Accuracy : 0.9524          
##                                           
##        'Positive' Class : FEMALE          
## 
# Random forest on the 3-D supervised-UMAP coordinates.
# NOTE(review): as with rfFit, no set.seed() immediately precedes this fit.
rfFit2 <- train(SEX ~ ., data = training2, 
               method = "rf", 
               trControl = fitControl,
               verbose = FALSE)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
rfFit2
## Random Forest 
## 
## 136 samples
##   3 predictor
##   2 classes: 'FEMALE', 'MALE' 
## 
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy  Kappa
##   2     1         1    
##   3     1         1    
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
varImp(rfFit2)  # importance concentrates on V1 (see output)
## rf variable importance
## 
##    Overall
## V1   100.0
## V3    14.4
## V2     0.0
# Evaluate on the projected test coordinates from umap_transform().
confusionMatrix(predict(rfFit2, test_umap), testing$SEX)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FEMALE MALE
##     FEMALE     23    2
##     MALE        0   19
##                                           
##                Accuracy : 0.9545          
##                  95% CI : (0.8453, 0.9944)
##     No Information Rate : 0.5227          
##     P-Value [Acc > NIR] : 3.335e-10       
##                                           
##                   Kappa : 0.9085          
##                                           
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9048          
##          Pos Pred Value : 0.9200          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.5227          
##          Detection Rate : 0.5227          
##    Detection Prevalence : 0.5682          
##       Balanced Accuracy : 0.9524          
##                                           
##        'Positive' Class : FEMALE          
## 
# RBF-kernel SVM on the 3-D supervised-UMAP coordinates.
svmFit2 <- train(SEX ~ ., data = training2, 
                method = "svmRadial", 
                trControl = fitControl)
svmFit2
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 136 samples
##   3 predictor
##   2 classes: 'FEMALE', 'MALE' 
## 
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ... 
## Resampling results across tuning parameters:
## 
##   C     Accuracy  Kappa
##   0.25  1         1    
##   0.50  1         1    
##   1.00  1         1    
## 
## Tuning parameter 'sigma' was held constant at a value of 0.9823658
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.9823658 and C = 0.25.
varImp(svmFit2)  # model-free ROC-based importance for SVM (see output header)
## ROC curve variable importance
## 
##    Importance
## V1    100.000
## V3      5.291
## V2      0.000
# Evaluate on the projected test coordinates from umap_transform().
confusionMatrix(predict(svmFit2, test_umap), testing$SEX)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FEMALE MALE
##     FEMALE     23    3
##     MALE        0   18
##                                           
##                Accuracy : 0.9318          
##                  95% CI : (0.8134, 0.9857)
##     No Information Rate : 0.5227          
##     P-Value [Acc > NIR] : 4.385e-09       
##                                           
##                   Kappa : 0.8625          
##                                           
##  Mcnemar's Test P-Value : 0.2482          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.8571          
##          Pos Pred Value : 0.8846          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.5227          
##          Detection Rate : 0.5227          
##    Detection Prevalence : 0.5909          
##       Balanced Accuracy : 0.9286          
##                                           
##        'Positive' Class : FEMALE          
##